import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Download the UCI Wine data set over HTTPS so the file cannot be altered in
# transit. Column names are supplied explicitly; the first column is the
# cultivar label and the remaining thirteen are the measured features.
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_names = [
    'Cultivar', 'Alcohol', 'Malic_acid', 'Ash',
    'Alcalinity_of_ash', 'Magnesium', 'Total_phenols',
    'Flavanoids', 'Nonflavanoid_phenols', 'Proanthocyanin', 'Color_intensity',
    'Hue', 'OD280_OD315', 'Proline',
]
# header=None is what pandas already does when names= is passed; spelling it
# out makes it explicit that the first line of the file is read as data.
wine_data = pd.read_csv(wine_url, names=wine_names, header=None)
wine_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Cultivar 178 non-null int64 1 Alcohol 178 non-null float64 2 Malic_acid 178 non-null float64 3 Ash 178 non-null float64 4 Alcalinity_of_ash 178 non-null float64 5 Magnesium 178 non-null int64 6 Total_phenols 178 non-null float64 7 Flavanoids 178 non-null float64 8 Nonflavanoid_phenols 178 non-null float64 9 Proanthocyanin 178 non-null float64 10 Color_intensity 178 non-null float64 11 Hue 178 non-null float64 12 OD280_OD315 178 non-null float64 13 Proline 178 non-null int64 dtypes: float64(11), int64(3) memory usage: 19.6 KB
wine_data.isna().sum()
Cultivar 0 Alcohol 0 Malic_acid 0 Ash 0 Alcalinity_of_ash 0 Magnesium 0 Total_phenols 0 Flavanoids 0 Nonflavanoid_phenols 0 Proanthocyanin 0 Color_intensity 0 Hue 0 OD280_OD315 0 Proline 0 dtype: int64
wine_data.nunique()
Cultivar 3 Alcohol 126 Malic_acid 133 Ash 79 Alcalinity_of_ash 63 Magnesium 53 Total_phenols 97 Flavanoids 132 Nonflavanoid_phenols 39 Proanthocyanin 101 Color_intensity 132 Hue 78 OD280_OD315 122 Proline 121 dtype: int64
wine_data.Cultivar.value_counts()
2 71 1 59 3 48 Name: Cultivar, dtype: int64
# Treat the cultivar label as a categorical variable rather than an integer.
wine_data['Cultivar'] = wine_data.Cultivar.astype('category')
# Pair plot of the data, with points colored by cultivar.
sns.pairplot(data = wine_data, hue='Cultivar')
plt.show()
13 * 13
169
sns.set_style('whitegrid')
# Restrict the pair plot to four selected features, still colored by cultivar.
sns.pairplot(data = wine_data,
vars=['Color_intensity', 'Proline', 'Flavanoids', 'OD280_OD315'],
hue='Cultivar')
plt.show()
### Look at boxplots to understand the summary statistics of the features.
fig, ax = plt.subplots(figsize=(18, 6))
# showmeans=True asks for the mean to be drawn on each box as well.
wine_data.boxplot(ax = ax, showmeans=True)
plt.show()
### Make the boxplot figure in seaborn.
sns.catplot(data = wine_data, kind='box', aspect=3)
plt.show()
wine_data.head()
| Cultivar | Alcohol | Malic_acid | Ash | Alcalinity_of_ash | Magnesium | Total_phenols | Flavanoids | Nonflavanoid_phenols | Proanthocyanin | Color_intensity | Hue | OD280_OD315 | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
# Keep only the numeric columns as clustering features. Cultivar was cast to
# category earlier, so it is not selected. copy() makes wine_features an
# independent DataFrame that can be modified without touching wine_data.
wine_features = wine_data.select_dtypes('number').copy()
wine_features.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Alcohol 178 non-null float64 1 Malic_acid 178 non-null float64 2 Ash 178 non-null float64 3 Alcalinity_of_ash 178 non-null float64 4 Magnesium 178 non-null int64 5 Total_phenols 178 non-null float64 6 Flavanoids 178 non-null float64 7 Nonflavanoid_phenols 178 non-null float64 8 Proanthocyanin 178 non-null float64 9 Color_intensity 178 non-null float64 10 Hue 178 non-null float64 11 OD280_OD315 178 non-null float64 12 Proline 178 non-null int64 dtypes: float64(11), int64(2) memory usage: 18.2 KB
# Standardize every feature column. The result is a plain NumPy array, so the
# column names of wine_features are not carried along.
Xwine = StandardScaler().fit_transform(wine_features)
print( type(Xwine) )
<class 'numpy.ndarray'>
Check that the standardization was "successful".
sns.catplot(data = pd.DataFrame(Xwine, columns=wine_features.columns), kind='box', aspect=3)
plt.show()
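Standardization first CENTERS and then SCALES, where CENTER refers to subtracting the SAMPLE AVERAGE and SCALE refers to dividing by the SAMPLE STANDARD DEVIATION. (Note that `StandardScaler` computes the standard deviation with the divide-by-$N$ estimator, i.e. `ddof=0`, rather than the divide-by-$(N-1)$ estimator.)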
If we have a variable, $x_{n,d}$, the standardized value is:
$$ \tilde{x}_{n,d} = \frac{x_{n,d} - \mathrm{mean}\left(x_{:,d}\right)}{\mathrm{std}\left(x_{:,d}\right)} $$
wine_pcs = PCA(n_components=2).fit_transform(Xwine)
print( type(wine_pcs) )
<class 'numpy.ndarray'>
print( wine_pcs.shape )
(178, 2)
wine_pcs_df = pd.DataFrame(wine_pcs, columns=['pc1', 'pc2'])
wine_pcs_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pc1 178 non-null float64 1 pc2 178 non-null float64 dtypes: float64(2) memory usage: 2.9 KB
sns.relplot(data = wine_pcs_df, x='pc1', y='pc2', s=100, height=7)
plt.show()
Color the points on the PC scatter plot by their known Cultivar group.
wine_pcs_df['Cultivar'] = wine_data.Cultivar
wine_pcs_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pc1 178 non-null float64 1 pc2 178 non-null float64 2 Cultivar 178 non-null category dtypes: category(1), float64(2) memory usage: 3.2 KB
sns.relplot(data = wine_pcs_df, x='pc1', y='pc2', hue='Cultivar', s=100, height=7)
plt.show()
Find the optimal number of clusters using the Silhouette Coefficient.
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
# Fit KMeans for each candidate number of clusters and record the average
# silhouette coefficient of the resulting partition of the standardized data.
sil_coef = []
K = range(2, 16)  # candidate cluster counts 2..15
for k in K:
    # random_state is fixed so the run is reproducible; n_init repeats the
    # algorithm from different starting centroids and keeps the best run.
    km = KMeans(n_clusters=k, random_state=121, n_init=121, max_iter=500)
    k_label = km.fit_predict(Xwine)
    sil_coef.append(silhouette_score(Xwine, k_label))
sil_coef
[0.25931695553182543, 0.2848589191898987, 0.25987462789190524, 0.22632341517700455, 0.24126027081639909, 0.20127537489228578, 0.15222994325257241, 0.13842020815454018, 0.14756387910379007, 0.148730008321073, 0.1407770602426679, 0.13612017523381112, 0.1335452147864896, 0.15130648783922934]
# Average silhouette coefficient as a function of the number of clusters,
# drawn as a blue line with circle markers.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(K, sil_coef, color='b', marker='o', linestyle='-')
ax.set(xlabel='number of clusters',
       ylabel='average silhouette coefficient')
plt.show()
from scipy.cluster import hierarchy
We still want to use the standardized features.
hclust_complete = hierarchy.complete(Xwine)
print( type(hclust_complete) )
<class 'numpy.ndarray'>
Visualize the hierarchical clustering results via a DENDROGRAM.
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_complete)
plt.show()
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_complete, no_labels=True)
plt.show()
Single linkage.
hclust_single = hierarchy.single(Xwine)
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_single, no_labels=True)
plt.show()
Average linkage.
hclust_average = hierarchy.average(Xwine)
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_average, no_labels=True)
plt.show()
Centroid linkage.
hclust_centroid = hierarchy.centroid(Xwine)
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_centroid, no_labels=True)
plt.show()
hclust_ward = hierarchy.ward(Xwine)
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(hclust_ward, no_labels=True)
plt.show()
# Cut the Ward linkage tree so that three clusters remain; the result holds
# one cluster label per observation.
ward_cut_3 = hierarchy.cut_tree(hclust_ward, n_clusters=3)
print( type(ward_cut_3) )
<class 'numpy.ndarray'>
print( ward_cut_3.shape )
(178, 1)
ward_cut_3[:3, ]
array([[0],
[0],
[0]])
print( ward_cut_3.ndim )
2
We need to convert the 2D array into a 1D array.
ward_cut_3.ravel().ndim
1
ward_cut_3.ravel().shape
(178,)
wine_pcs_df['hclust_3'] = pd.Series(ward_cut_3.ravel(), index=wine_pcs_df.index)
wine_pcs_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pc1 178 non-null float64 1 pc2 178 non-null float64 2 Cultivar 178 non-null category 3 hclust_3 178 non-null int32 dtypes: category(1), float64(2), int32(1) memory usage: 3.9 KB
wine_pcs_df.hclust_3 = wine_pcs_df.hclust_3.astype('category')
wine_pcs_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pc1 178 non-null float64 1 pc2 178 non-null float64 2 Cultivar 178 non-null category 3 hclust_3 178 non-null category dtypes: category(2), float64(2) memory usage: 3.5 KB
wine_pcs_df.hclust_3.value_counts()
0 64 1 58 2 56 Name: hclust_3, dtype: int64
sns.catplot(data = wine_pcs_df, x='hclust_3', kind='count')
plt.show()
sns.relplot(data = wine_pcs_df, x='pc1', y='pc2', hue='hclust_3', s=100, height=7)
plt.show()
sns.relplot(data = wine_pcs_df, x='pc1', y='pc2', hue='hclust_3',
col='Cultivar',
s=200, height=7)
plt.show()
Use a heat map to look at the cross-tabulation between the cluster assignment and the known Cultivar group.
pd.crosstab(wine_pcs_df.hclust_3, wine_pcs_df.Cultivar, margins=True)
| Cultivar | 1 | 2 | 3 | All |
|---|---|---|---|---|
| hclust_3 | ||||
| 0 | 59 | 5 | 0 | 64 |
| 1 | 0 | 58 | 0 | 58 |
| 2 | 0 | 8 | 48 | 56 |
| All | 59 | 71 | 48 | 178 |
# Annotated heat map of cluster assignment versus known cultivar. The
# cross-tabulation (including the "All" margins) is built first and then
# handed to seaborn.
fig, ax = plt.subplots(figsize=(9,7))
cluster_vs_cultivar = pd.crosstab(index=wine_pcs_df.hclust_3,
                                  columns=wine_pcs_df.Cultivar,
                                  margins=True)
sns.heatmap(cluster_vs_cultivar, annot=True, annot_kws={'size': 25})
plt.show()
# Capture the column names BEFORE the cluster label is added, so that
# feature_names lists only the original feature columns.
feature_names = wine_features.columns
# Attach the 3-cluster assignment to the unscaled feature values.
wine_features['hclust_3'] = wine_pcs_df.hclust_3
wine_features.head()
| Alcohol | Malic_acid | Ash | Alcalinity_of_ash | Magnesium | Total_phenols | Flavanoids | Nonflavanoid_phenols | Proanthocyanin | Color_intensity | Hue | OD280_OD315 | Proline | hclust_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 0 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 0 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 0 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 0 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 0 |
Reshape the wide format into a long format.
# Long format: one row per (observation, feature) pair, with the feature name
# in 'variable', its value in 'value', and hclust_3 kept as the identifier.
lf = wine_features.melt(id_vars=['hclust_3'], value_vars=feature_names, ignore_index=True)
lf
| hclust_3 | variable | value | |
|---|---|---|---|
| 0 | 0 | Alcohol | 14.23 |
| 1 | 0 | Alcohol | 13.20 |
| 2 | 0 | Alcohol | 13.16 |
| 3 | 0 | Alcohol | 14.37 |
| 4 | 0 | Alcohol | 13.24 |
| ... | ... | ... | ... |
| 2309 | 2 | Proline | 740.00 |
| 2310 | 2 | Proline | 750.00 |
| 2311 | 2 | Proline | 835.00 |
| 2312 | 2 | Proline | 840.00 |
| 2313 | 2 | Proline | 560.00 |
2314 rows × 3 columns
Summarize each variable within each cluster.
# One box-plot panel per feature (five panels per row), with the boxes grouped
# by cluster. The y axes are not shared between panels, and the mean of each
# box is drawn with the marker style defined below.
mean_marker = {
    'marker': 'o',
    'markerfacecolor': 'white',
    'markeredgecolor': 'black',
    'markersize': 10,
}
sns.catplot(
    data=lf,
    x='hclust_3',
    y='value',
    col='variable',
    col_wrap=5,
    kind='box',
    sharey=False,
    showmeans=True,
    meanprops=mean_marker,
)
plt.show()